This notebook calculates the correlations between different cell-entry selections: either the median of all LibA selections vs. the median of all LibB selections, or all selections for a specific condition.¶

In [1]:
# this cell is tagged as parameters for `papermill` parameterization
# this cell is tagged as parameters for `papermill` parameterization
# (defaults are None; papermill injects concrete paths in the next cell)
altair_config = None  # path to Altair theme script executed to style plots
nipah_config = None  # path to main YAML config with analysis cutoffs

codon_variants_file = None  # CSV of codon-level variant calls per barcode

CHO_corr_plot_save = None  # output HTML: LibA-vs-LibB median correlation plot
CHO_EFNB2_indiv_plot_save = None  # output HTML: all pairwise EFNB2 correlations
CHO_EFNB3_indiv_plot_save = None  # output HTML: all pairwise EFNB3 correlations

histogram_plot = None  # output HTML: variant substitution-count histogram
In [2]:
# Parameters
# (injected by papermill at execution time; overrides the None defaults above)
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
codon_variants_file = "results/variants/codon_variants.csv"
CHO_corr_plot_save = "results/images/CHO_corr_plot_save.html"
CHO_EFNB2_indiv_plot_save = "results/images/CHO_EFNB2_all_corrs.html"
CHO_EFNB3_indiv_plot_save = "results/images/CHO_EFNB3_all_corrs.html"
histogram_plot = "results/images/variants_histogram.html"
In [3]:
import math
import os
import re
import altair as alt

import numpy as np

import pandas as pd

import scipy.stats

import Bio.SeqIO
import yaml
from Bio import AlignIO
from Bio import PDB
from Bio.Align import PairwiseAligner
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# Ensure the notebook runs from the project root so all relative paths resolve.
# NOTE(review): a hardcoded absolute path ties this notebook to one filesystem;
# consider deriving it from an environment variable or the repo location instead.
PROJECT_DIR = '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/'
if os.getcwd() == PROJECT_DIR:
    print("Already in correct directory")
else:
    os.chdir(PROJECT_DIR)
    print("Setup in correct directory")
Setup in correct directory
In [5]:
#altair_config = 'data/custom_analyses_data/theme.py'
#nipah_config = 'nipah_config.yaml'
#codon_variants_file = 'results/variants/codon_variants.csv'
#CHO_corr_plot_save
#CHO_EFNB3_corr_plot_save
#CHO_EFNB2_indiv_plot_save
#CHO_EFNB3_indiv_plot_save
In [6]:
# Apply the shared Altair theme by executing the theme script.
# NOTE(review): exec() runs arbitrary code from altair_config — acceptable for
# a trusted repo file, but do not point this at untrusted input.
if altair_config:
    with open(altair_config, 'r') as file:
        exec(file.read())

# Global analysis parameters (e.g. times-seen cutoffs).
with open(nipah_config) as f:
    config = yaml.safe_load(f)

# Per-condition functional-effects configuration (lists of selections).
with open('data/func_effects_config.yml') as y:
    config_func = yaml.safe_load(y)
In [7]:
def _library_selection_files(selections, library, other_library):
    """Return per-selection func-effects CSV filenames belonging to one library.

    Keeps only selection names that mention `library` and not `other_library`,
    and appends the '_func_effects.csv' suffix used by the pipeline outputs.
    """
    return [
        selection + '_func_effects.csv'
        for selection in selections
        if library in selection and other_library not in selection
    ]

# Split each condition's selections into LibA-only and LibB-only file lists.
cho_efnb2_low_selections = config_func['avg_func_effects']['CHO_EFNB2_low']['selections']
LibA_CHO_EFNB2 = _library_selection_files(cho_efnb2_low_selections, 'LibA', 'LibB')
LibB_CHO_EFNB2 = _library_selection_files(cho_efnb2_low_selections, 'LibB', 'LibA')

cho_efnb3_low_selections = config_func['avg_func_effects']['CHO_EFNB3_low']['selections']
LibA_CHO_EFNB3 = _library_selection_files(cho_efnb3_low_selections, 'LibA', 'LibB')
LibB_CHO_EFNB3 = _library_selection_files(cho_efnb3_low_selections, 'LibB', 'LibA')

Calculate correlations for LibA and LibB for CHO-EFNB2 cell entry selections¶

In [8]:
# Directory holding per-selection functional-effect CSVs.
path = 'results/func_effects/by_selection/'

def process_func_selections(library, library_name, base_path=None):
    """Load per-selection functional-effect CSVs and summarize them as medians.

    Parameters
    ----------
    library : list of str
        CSV filenames (one per selection) read from `base_path`.
    library_name : str
        Label ('LibA' or 'LibB') used to name the output median columns.
    base_path : str, optional
        Directory containing the CSVs; defaults to the module-level `path`.

    Returns
    -------
    pandas.DataFrame
        One row per (site, wildtype, mutant) with columns
        'median_effect_<library_name>' and 'median_times_seen_<library_name>'.
    """
    if base_path is None:
        base_path = path
    df_list = []
    # Number each selection's columns so they stay distinct after merging.
    for idx, file_name in enumerate(library, start=1):
        func_scores = pd.read_csv(os.path.join(base_path, file_name))
        func_scores = func_scores.rename(
            columns={
                'functional_effect': f'functional_effect_{idx}',
                'times_seen': f'times_seen_{idx}',
            }
        ).drop(columns=['latent_phenotype_effect'])
        df_list.append(func_scores)

    # Outer merge on variant identity keeps variants observed in only some
    # selections (their missing values are ignored by the row-wise median).
    merged_df = df_list[0]
    for df in df_list[1:]:
        merged_df = pd.merge(merged_df, df, on=['site', 'mutant', 'wildtype'], how='outer')

    # Median across selections for the effect and the times-seen counts.
    effect_cols = [col for col in merged_df.columns if 'functional_effect' in col]
    merged_df[f'median_effect_{library_name}'] = merged_df[effect_cols].median(axis=1)
    seen_cols = [col for col in merged_df.columns if 'times_seen' in col]
    merged_df[f'median_times_seen_{library_name}'] = merged_df[seen_cols].median(axis=1)

    # Drop the per-selection (numbered) columns, keeping only the medians.
    numbered_cols = [col for col in merged_df.columns if re.search(r'_\d+', col)]
    return merged_df.drop(columns=numbered_cols)

# Per-library median functional effects for each receptor's selections.
A_selections_E2 = process_func_selections(LibA_CHO_EFNB2,'LibA')
B_selections_E2 = process_func_selections(LibB_CHO_EFNB2,'LibB')

A_selections_E3 = process_func_selections(LibA_CHO_EFNB3,'LibA')
B_selections_E3 = process_func_selections(LibB_CHO_EFNB3,'LibB')

def merge_selections(A_selections, B_selections):
    """Inner-join the LibA and LibB median tables on variant identity.

    Also adds a single 'times_seen' column (the median of all per-library
    times-seen columns) used by the interactive plot slider.
    """
    combined = A_selections.merge(
        B_selections, on=['wildtype', 'site', 'mutant'], how='inner'
    )
    seen_cols = [c for c in combined.columns if 'times_seen' in c]
    combined['times_seen'] = combined[seen_cols].median(axis=1)
    return combined

# Merge LibA/LibB medians per receptor and tag rows with the cell type.
CHO_EFNB2_merged = merge_selections(A_selections_E2,B_selections_E2)
CHO_EFNB2_merged['cell_type'] = 'CHO-EFNB2'
CHO_EFNB3_merged = merge_selections(A_selections_E3,B_selections_E3)
CHO_EFNB3_merged['cell_type'] = 'CHO-EFNB3'

# Stack both receptors into one dataframe for the combined correlation plot.
both_selections = pd.concat([CHO_EFNB2_merged, CHO_EFNB3_merged])

def make_chart_median(df, title):
    """Scatter-plot LibA vs LibB median cell entry, one panel per cell type.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged per-variant medians with 'median_effect_LibA',
        'median_effect_LibB', 'times_seen', and 'cell_type' columns.
    title : str
        Unused; kept only for backward compatibility with existing calls.
        NOTE(review): consider removing it or using it as the figure title.

    Returns
    -------
    alt.HConcatChart
        Side-by-side CHO-EFNB2 / CHO-EFNB3 correlation plots with a
        times_seen slider and mouseover highlighting.
    """
    # Slider to hide variants seen fewer than N times (rendered transparent).
    slider = alt.binding_range(min=1, max=25, step=1, name="times_seen")
    selector = alt.param(name="SelectorName", value=1, bind=slider)

    # Highlight the hovered variant across both panels.
    variant_selector = alt.selection_point(
        on="mouseover",
        empty=False,
        fields=["site", "mutant"],
        value=1,
    )

    # Keep only variants measured in both libraries.
    df = df[df['median_effect_LibA'].notna() & df['median_effect_LibB'].notna()]

    # Report the LibA-vs-LibB Pearson r across all retained variants.
    regression = scipy.stats.linregress(df['median_effect_LibA'], df['median_effect_LibB'])
    r_value = float(regression.rvalue)
    print(f'{r_value:.2f}')

    charts = []
    for selection in ['CHO-EFNB2', 'CHO-EFNB3']:
        print(selection)
        tmp_df = df[df['cell_type'] == selection]
        chart = alt.Chart(tmp_df, title=f'Entry in {selection} cells').mark_circle(
            size=15, color='black'
        ).encode(
            x=alt.X('median_effect_LibA', title='LibA Cell Entry'),
            y=alt.Y('median_effect_LibB', title='LibB Cell Entry'),
            tooltip=['site', 'wildtype', 'mutant', 'times_seen'],
            size=alt.condition(variant_selector, alt.value(100), alt.value(15)),
            # Variants below the slider cutoff are rendered transparent.
            color=alt.condition(alt.datum.times_seen < selector, alt.value('transparent'), alt.value('black')),
            opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.2)),
        ).properties(
            height=300,
            width=300
        )
        charts.append(chart)
    combined_effect_chart = alt.hconcat(*charts).resolve_scale(
        y='shared', x='shared', color='independent'
    ).add_params(variant_selector, selector)
    return combined_effect_chart

# Build, display, and save the LibA-vs-LibB median correlation figure.
CHO_EFNB2_corr_plot = make_chart_median(both_selections,'CHO-EFNB2')
CHO_EFNB2_corr_plot.display()
CHO_EFNB2_corr_plot.save(CHO_corr_plot_save)
0.92
CHO-EFNB2
CHO-EFNB3

Make correlations between individual selections¶

In [9]:
def process_individ_selections(library, base_path=None):
    """Load each selection's functional-effect CSV and merge them side by side.

    Parameters
    ----------
    library : list of str
        CSV filenames (one per selection) read from `base_path`.
    base_path : str, optional
        Directory containing the CSVs; defaults to the module-level `path`.

    Returns
    -------
    (pandas.DataFrame, list of int)
        The outer-merged dataframe with numbered per-selection columns
        ('functional_effect_i', 'times_seen_i'), and the list [1..n] of
        selection indices used by the pairwise correlation grid.
    """
    if base_path is None:
        base_path = path
    df_list = []
    # Number each selection's columns so they stay distinct after merging.
    for idx, file_name in enumerate(library, start=1):
        file_path = os.path.join(base_path, file_name)
        print(f"Processing file: {file_path}")
        func_scores = pd.read_csv(file_path)
        func_scores = func_scores.rename(
            columns={
                'functional_effect': f'functional_effect_{idx}',
                'times_seen': f'times_seen_{idx}',
            }
        ).drop(columns=['latent_phenotype_effect'])
        df_list.append(func_scores)

    # Outer merge on variant identity keeps variants missing from some selections.
    merged_df = df_list[0]
    for df in df_list[1:]:
        merged_df = pd.merge(merged_df, df, on=['site', 'mutant', 'wildtype'], how='outer')

    # Selection indices for the later pairwise correlation plots.
    number_list = list(range(1, len(library) + 1))
    return merged_df, number_list

# Merge all individual selections (both libraries) per receptor.
CHO_EFNB2_indiv,lib_size_EFNB2 = process_individ_selections(LibA_CHO_EFNB2+LibB_CHO_EFNB2)
CHO_EFNB3_indiv,lib_size_EFNB3 = process_individ_selections(LibA_CHO_EFNB3+LibB_CHO_EFNB3)

def make_chart(df, number_list):
    """Grid of pairwise scatter plots comparing every selection to every other.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged dataframe with numbered 'functional_effect_i' / 'times_seen_i'
        columns from `process_individ_selections`.
    number_list : list of int
        Selection indices [1..n]; one row/column of panels per index.

    Returns
    -------
    alt.VConcatChart
        n-by-n grid of correlation scatter plots.
    """
    rows = []
    for i in number_list:
        row_charts = []
        for j in number_list:
            # BUGFIX: filter each pair from the ORIGINAL dataframe. The previous
            # version reassigned `df` inside the loop, so every later panel was
            # drawn from an ever-shrinking, cumulatively filtered dataset.
            pair_df = df[
                (df[f'times_seen_{i}'] >= config['func_times_seen_cutoff']) &
                (df[f'times_seen_{j}'] >= config['func_times_seen_cutoff']) &
                (df[f'functional_effect_{i}'].notna()) &
                (df[f'functional_effect_{j}'].notna())
            ]
            chart = alt.Chart(pair_df).mark_circle(size=10, color='black', opacity=0.2).encode(
                x=alt.X(f'functional_effect_{i}'),
                y=alt.Y(f'functional_effect_{j}'),
                tooltip=['site', 'wildtype', 'mutant'],
            ).properties(
                height=alt.Step(10),
                width=alt.Step(10)
            )
            row_charts.append(chart)
        rows.append(alt.hconcat(*row_charts).resolve_scale(y='shared', x='shared', color='independent'))
    return alt.vconcat(*rows).resolve_scale(y='shared', x='shared', color='independent')

# Build and save the pairwise correlation grids for each receptor.
CHO_EFNB2_indiv_plot = make_chart(CHO_EFNB2_indiv,lib_size_EFNB2)
#CHO_EFNB2_indiv_plot.display()
CHO_EFNB2_indiv_plot.save(CHO_EFNB2_indiv_plot_save)
CHO_EFNB3_indiv_plot = make_chart(CHO_EFNB3_indiv,lib_size_EFNB3)
CHO_EFNB3_indiv_plot.save(CHO_EFNB3_indiv_plot_save)
Processing file: results/func_effects/by_selection/LibA-231112-CHO-EFNB2-BA6-nac_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-231207-CHO-EFNB2-BA6-1_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-231207-CHO-EFNB2-BA6-2_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-231207-CHO-EFNB2-BA6-3_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-231207-CHO-EFNB2-BA6-pool_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-231222-CHO-EFNB2-BA6-nac_diffVSV_func_effects.csv
Processing file: results/func_effects/by_selection/LibB-231112-CHO-EFNB2-BA6-nac_diff_VSV_func_effects.csv
Processing file: results/func_effects/by_selection/LibB-231116-CHO-BA6_PREV_POOL_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-230725-CHO-EFNB3-C6-nac-diffVSV_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-230916-CHO-EFNB2-BA6-nac_diffVSV_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-231024-CHO-EFNB3-C6-nac_func_effects.csv
Processing file: results/func_effects/by_selection/LibB-230720-CHO-C6-nac-VSV_func_effects.csv
Processing file: results/func_effects/by_selection/LibB-230815-CHO-C6-nac_func_effects.csv
Processing file: results/func_effects/by_selection/LibB-230818-CHO-C6-nac_func_effects.csv
Processing file: results/func_effects/by_selection/LibB-231116-CHO-C6_PREV_POOL_func_effects.csv

Now make histogram of variants¶

In [10]:
# Load the codon-variant table and summarize barcode diversity per library.
codon_variants = pd.read_csv(codon_variants_file)
display(codon_variants.head(3))
unique_barcodes_per_library = codon_variants.groupby('library')['barcode'].nunique()
display(unique_barcodes_per_library)
target library barcode variant_call_support codon_substitutions aa_substitutions n_codon_substitutions n_aa_substitutions
0 gene LibA AAAAAAAAAAAAAGAA 5 ACC461ACT ATC475AGC I475S 2 1
1 gene LibA AAAAAAAAAAACCCAT 36 GCG16GAG CAG23GAG A16E Q23E 2 2
2 gene LibA AAAAAAAAAAAGTTTC 6 TAC319CCC Y319P 1 1
library
LibA    72252
LibB    52162
Name: barcode, dtype: int64
In [11]:
def calculate_fraction(library, variants=None, max_substitutions=5):
    """Print the fraction of a library's variants with each substitution count.

    Parameters
    ----------
    library : str
        Library name to filter on (e.g. 'LibA').
    variants : pandas.DataFrame, optional
        Variant table with 'library' and 'n_aa_substitutions' columns;
        defaults to the notebook-level `codon_variants`.
    max_substitutions : int, optional
        Reports counts 0 .. max_substitutions-1 (exclusive upper bound).
    """
    if variants is None:
        variants = codon_variants
    # Filter to this library once instead of re-filtering inside the loop.
    library_variants = variants[variants['library'] == library]
    total = library_variants.shape[0]
    if total == 0:
        # Nothing to report; avoids division by zero for an absent library.
        return
    for number in range(max_substitutions):
        count = library_variants[library_variants['n_aa_substitutions'] == number].shape[0]
        print(f'For {library}, the fraction of sites with {number} mutations relative to WT are: {count/total:.2f}')

# Report substitution-count fractions for both libraries.
calculate_fraction('LibA')
calculate_fraction('LibB')
For LibA, the fraction of sites with 0 mutations relative to WT are: 0.11
For LibA, the fraction of sites with 1 mutations relative to WT are: 0.64
For LibA, the fraction of sites with 2 mutations relative to WT are: 0.21
For LibA, the fraction of sites with 3 mutations relative to WT are: 0.03
For LibA, the fraction of sites with 4 mutations relative to WT are: 0.00
For LibB, the fraction of sites with 0 mutations relative to WT are: 0.11
For LibB, the fraction of sites with 1 mutations relative to WT are: 0.65
For LibB, the fraction of sites with 2 mutations relative to WT are: 0.20
For LibB, the fraction of sites with 3 mutations relative to WT are: 0.03
For LibB, the fraction of sites with 4 mutations relative to WT are: 0.00
In [12]:
def plot_histogram(df):
    """Bar chart of variant counts by number of amino-acid substitutions,
    faceted into one panel per library."""
    # Keep only the columns the chart needs; smaller data embeds in the HTML.
    trimmed = df.drop(columns=[
        'barcode', 'target', 'variant_call_support',
        'codon_substitutions', 'aa_substitutions', 'n_codon_substitutions',
    ])
    histogram_chart = (
        alt.Chart(trimmed)
        .mark_bar(color='black')
        .encode(
            alt.X("n_aa_substitutions:N", title='# of AA Substitutions'),
            # count() is a built-in aggregation that counts rows per bin.
            alt.Y('count()', title='Count', axis=alt.Axis(grid=True)),
            column=alt.Column('library', title=None),
        )
        .properties(width=300, height=300)
    )
    return histogram_chart

# Build, display, and save the substitution-count histogram.
histogram = plot_histogram(codon_variants)
histogram.display()
histogram.save(histogram_plot)
In [13]:
# Load raw per-barcode functional scores for every CHO-EFNB2 selection and
# stack them into one long dataframe tagged by selection name.
empty_list = []
for i in cho_efnb2_low_selections:
    j = i + '_func_scores.csv'
    tmp_df = pd.read_csv(f'results/func_scores/{j}')
    tmp_df['selection'] = i
    empty_list.append(tmp_df)

total_df = pd.concat(empty_list)
display(total_df.head(3))
def classify_mutation(row):
    """Label a variant row by mutation category.

    Categories: 'stop' (any '*' in the substitutions), 'wildtype' (no codon
    changes), 'synonymous' (codon changes but no amino-acid changes),
    '1 nonsynonymous', or '>2 nonsynonymous' (two or more).
    """
    aa_subs = row['aa_substitutions']
    # Stop codons take priority regardless of substitution counts.
    if isinstance(aa_subs, str) and '*' in aa_subs:
        return 'stop'
    n_aa = row['n_aa_substitutions']
    if n_aa == 0:
        return 'synonymous' if row['n_codon_substitutions'] >= 1 else 'wildtype'
    if n_aa == 1:
        return '1 nonsynonymous'
    if n_aa >= 2:
        return '>2 nonsynonymous'

# Apply the function to each row in the dataframe to create the new column
total_df['mutation_class'] = total_df.apply(classify_mutation, axis=1)
display(total_df)

# Collapse replicate measurements: median func_score per barcode.
# NOTE(review): 'first' assumes a barcode's mutation class is identical
# across selections — confirm against the variant table.
result_df = total_df.groupby('barcode').agg(
    func_score=('func_score', 'median'),
    mutation_class=('mutation_class', 'first')
).reset_index()

display(result_df)

# Convert 'mutation_class' to a categorical type with the specified order
#result_df['mutation_class'] = result_df['mutation_class'].astype(object)
#display(result_df.dtypes)

# Order classes from most benign to most disruptive for the faceted plot rows.
custom_sort = ['wildtype','synonymous','1 nonsynonymous','>2 nonsynonymous','stop']
result_df['mutation_class'] = pd.Categorical(result_df['mutation_class'], categories=custom_sort, ordered=True)

# Histogram of per-barcode median functional scores, one row per mutation class.
chart = alt.Chart(result_df).mark_bar().encode(
    x=alt.X('func_score',bin=alt.Step(0.1),axis=alt.Axis(values=[-10,-5,0],grid=True)),
    y=alt.Y('count()',title=None,axis=alt.Axis(domain=False,ticks=False,labels=False)),
    #color=alt.Color('mean_func_score',scale=alt.Scale(scheme='yellowgreenblue')),
    row=alt.Row('mutation_class',title=None,sort=custom_sort,header=alt.Header(labelAngle=0, labelAlign='left', titleAlign='left', titleAnchor='start')),
).properties(width=200,height=100).configure_view(stroke=None).configure_axis(grid=False)#.resolve_scale(y='shared')

# Bare expression at cell end renders the chart inline.
chart
func_score func_score_var barcode aa_substitutions n_aa_substitutions n_codon_substitutions selection
0 2.530 0.07171 TATAAACGTACTGAAT R372L 1 1 LibA-231112-CHO-EFNB2-BA6-nac
1 2.393 0.06118 TACTTGAATCAGAATC L526G 1 1 LibA-231112-CHO-EFNB2-BA6-nac
2 2.315 0.05853 CCTGAATAGAGATTGC N277Q 1 1 LibA-231112-CHO-EFNB2-BA6-nac
func_score func_score_var barcode aa_substitutions n_aa_substitutions n_codon_substitutions selection mutation_class
0 2.530 0.07171 TATAAACGTACTGAAT R372L 1 1 LibA-231112-CHO-EFNB2-BA6-nac 1 nonsynonymous
1 2.393 0.06118 TACTTGAATCAGAATC L526G 1 1 LibA-231112-CHO-EFNB2-BA6-nac 1 nonsynonymous
2 2.315 0.05853 CCTGAATAGAGATTGC N277Q 1 1 LibA-231112-CHO-EFNB2-BA6-nac 1 nonsynonymous
3 2.300 0.03186 CAACGAATGTCCTGAG NaN 0 0 LibA-231112-CHO-EFNB2-BA6-nac wildtype
4 2.240 0.03337 CAACCAGAATGATAAC NaN 0 1 LibA-231112-CHO-EFNB2-BA6-nac synonymous
... ... ... ... ... ... ... ... ...
48672 -11.290 4.16600 AGTTAGGATTCCAACC L577H A594T 2 2 LibB-231116-CHO-BA6_PREV_POOL >2 nonsynonymous
48673 -11.410 4.16500 ATAAGAGTAGATACGG T117R 1 1 LibB-231116-CHO-BA6_PREV_POOL 1 nonsynonymous
48674 -11.480 4.16500 ATACAATACTGCCTAT A594N 1 1 LibB-231116-CHO-BA6_PREV_POOL 1 nonsynonymous
48675 -11.490 4.16500 TGATTACGGTAGATCG Q88K G128E V469L 3 3 LibB-231116-CHO-BA6_PREV_POOL >2 nonsynonymous
48676 -12.630 4.16400 GATCCGCGCAGCAATT T114C 1 1 LibB-231116-CHO-BA6_PREV_POOL 1 nonsynonymous

489850 rows × 8 columns

barcode func_score mutation_class
0 AAAAAAAAAAAAAGAA -0.40630 1 nonsynonymous
1 AAAAAAAAAAGACCCG -2.21500 >2 nonsynonymous
2 AAAAAAAAAAGGATTC -0.86850 >2 nonsynonymous
3 AAAAAAAAAAGTTACT -1.01200 1 nonsynonymous
4 AAAAAAAAACCTATAG -0.67630 1 nonsynonymous
... ... ... ...
118313 TTTTTTGGAGACAATC 0.49610 1 nonsynonymous
118314 TTTTTTGGCGGCCTAA 0.31700 synonymous
118315 TTTTTTTAAGACTACA -3.02750 1 nonsynonymous
118316 TTTTTTTACTCGAATG 0.09682 1 nonsynonymous
118317 TTTTTTTTCTAAAAAA -1.83700 1 nonsynonymous

118318 rows × 3 columns

Out[13]:
In [ ]: